---
title: "Study 4 Cleaning"
author: "Kaylyn Jackson Schiff, Daniel Schiff, and Natalia Bueno"
date: "2022"
output: pdf_document
editor_options: 
  chunk_output_type: console
---

##### Setup Chunk
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
require(rmarkdown)
require(skimr)
require(readr)
require(tidyverse)
require(estimatr)
options(scipen=999)
```

##### Functions
```{r functions}
set.seed(1234)

# Build a standardized mean-effects index from several outcome columns.
#
# Each outcome is converted to a z-score using the control-group (Z == 0) mean
# and standard deviation, the z-scores are averaged within each row, and that
# average is standardized once more against the control group.
#
# Arguments:
#   Z            Treatment indicator coded 0 (control) / 1 (treated), one entry
#                per row of outcome_mat.
#   outcome_mat  Numeric matrix or data frame, one column per outcome.
#   to_reorient  Columns of outcome_mat (names or positions) to sign-flip.
#                Only used when reorient = TRUE.
#   reorient     If TRUE, the to_reorient columns are multiplied by -1 before
#                standardizing.
#   greedy       If TRUE, each row is averaged over the outcomes it actually
#                has (rows with no observed outcome are NA). If FALSE, the sum
#                is divided by the total number of outcomes, so a row with any
#                missing outcome is NA.
#   impute       If TRUE, missing outcomes are replaced by the mean of that
#                outcome within the row's own treatment arm before standardizing.
#
# Returns a numeric vector with one index value per row of outcome_mat.
calculate_mean_effects_index <- function(Z, outcome_mat, to_reorient = NULL, reorient = FALSE, greedy = TRUE,
                                         impute = FALSE){
  # Work on a plain matrix so data frames and single-column input behave alike
  outcome_mat <- as.matrix(outcome_mat)
  if(length(Z) != nrow(outcome_mat)) stop("Error: Treatment assignment, outcome matrix require same n!")

  # Rows with a missing Z belong to neither arm
  is_control <- !is.na(Z) & Z == 0
  is_treated <- !is.na(Z) & Z == 1

  if(reorient == TRUE){
    if(is.null(to_reorient)) stop("Error: reorient = TRUE requires to_reorient!")
    outcome_mat[, to_reorient] <- -outcome_mat[, to_reorient]
  }

  if(impute == TRUE){
    R <- 1 * is.na(outcome_mat)
    # Row 1 holds control-arm means, row 2 treated-arm means; Z + 1 picks the row
    means_for_imputation <- rbind(
      apply(outcome_mat[is_control, , drop = FALSE], MARGIN = 2, FUN = mean, na.rm = TRUE),
      apply(outcome_mat[is_treated, , drop = FALSE], MARGIN = 2, FUN = mean, na.rm = TRUE))
    to_impute <- R * means_for_imputation[Z + 1, , drop = FALSE]
    outcome_mat[is.na(outcome_mat)] <- 0
    outcome_mat <- outcome_mat + to_impute
  }

  c_mean <- apply(outcome_mat[is_control, , drop = FALSE], MARGIN = 2, FUN = mean, na.rm = TRUE)
  c_sd <- apply(outcome_mat[is_control, , drop = FALSE], MARGIN = 2, FUN = sd, na.rm = TRUE)
  z_score <- sweep(sweep(outcome_mat, 2, c_mean, "-"), 2, c_sd, "/")

  if(greedy == TRUE){
    # Sum and count only the observed z-scores; without na.rm the sum would be
    # NA for every incomplete row and the greedy denominator would never matter
    index_numerator <- rowSums(z_score, na.rm = TRUE)
    n_outcomes <- rowSums(!is.na(z_score))
    n_outcomes[n_outcomes == 0] <- NA
  } else {
    index_numerator <- rowSums(z_score)
    n_outcomes <- ncol(outcome_mat)
  }

  index <- index_numerator / n_outcomes
  index <- (index - mean(index[is_control], na.rm = TRUE)) / sd(index[is_control], na.rm = TRUE)
  index
}
```

##### Import Data
```{r import data}
#The raw-data pipeline below is kept for reference but commented out; only the
#readRDS() call at the end of this chunk runs.

#Read the raw CSV: column names come from the first row, data start after skipping 3 rows
#df_names <- read_csv("data/prolific_sample_11.28.22.csv", n_max = 0) %>% names()
#df_raw <- read_csv("data/prolific_sample_11.28.22.csv", col_names = df_names, skip = 3)
#df <- df_raw

#Add in demographic data from Prolific
# Not available due to PII concerns
#df_cov_1 <- read.csv("data/prolific_demos_1.csv")
#df_cov_2 <- read.csv("data/prolific_demos_2.csv")
#df_cov_3 <- read.csv("data/prolific_demos_3.csv")
#df_cov <- rbind(df_cov_1, df_cov_2, df_cov_3)
#df <- df %>% left_join(df_cov, by = c("PROLIFIC_PID" = "Participant.id"))

#Recode deception_consent here

#Is there differential attrition on deception consent form based on treatment?
#(each assignment column is turned into a 0/1 indicator: NA -> 0, otherwise 1)
#df$Control <- if_else(is.na(df$Control)==T, 0, 1)
#df$IU <- if_else(is.na(df$IU)==T, 0, 1)
#df$OR <- if_else(is.na(df$OR)==T, 0, 1)
#df$video <- if_else(is.na(df$video)==T, 0, 1)
#summary(lm_robust(deception_consent ~ Control, data = df))
#summary(lm_robust(deception_consent ~ IU, data = df))
#summary(lm_robust(deception_consent ~ OR, data = df))
#summary(lm_robust(deception_consent ~ video, data = df))

#Filter to only people who gave us consent to use their data after being notified of deception
#length(which(df$deception_consent_2 == 0)) #number of people that will be filtered out
#df <- df %>% filter(deception_consent_2 != 0 | is.na(deception_consent_2))
#Removing due to PII concerns
#df <- df %>% select(-PROLIFIC_PID, -Submission.id, -Fluent.languages)

#saveRDS(df, "data/prolific_sample_11.28.22.rds")

#Load the saved data set (the file the commented-out saveRDS() call above would write)
df <- readRDS("data/prolific_sample_11.28.22.rds")

```

##### Explore Data
```{r explore data}
#Inspect the imported data; these calls only print and do not modify df
df             #print the data
glimpse(df)    #compact overview of every column
colnames(df)   #list the column names
```

##### Subset Data
```{r subset data}
#Keep only the columns used below; Sex and Age are renamed to gender and age on the way
df <- dplyr::select(
  df,
  #respondent data
  ResponseId,
  #screening questions
  screener_1_1:screener_1_21,
  screener_2_1:screener_2_6,
  #politician name
  politician,
  #media format
  video, text,
  clicked_jesse, clicked_john, clicked_tim, clicked_todd,
  #allegations
  IU, OR, Control,
  #belief outcomes
  belief_1, belief_2,
  #support outcomes
  support_1, support_2, support_3, support_4,
  #trust outcomes
  trust_1, trust_2,
  #demographics
  party, gender = Sex, race, age = Age, education_2, income, region,
  #news media and digital literacy
  literacy1, literacy2, literacy3, familiar_df
)

#summary statistics and class
skim(df)
```

##### Recode and Create Variables
```{r recode and create new variables}
#Create attentiveness index
#Screener 1 is passed (1) when item 21 equals 1 and items 1-20 are all NA; otherwise 0
df$screener_1 <- if_else(
  unname(rowSums(!is.na(df[paste0("screener_1_", 1:20)]))) == 0 &
    (!is.na(df$screener_1_21) & df$screener_1_21 == 1),
  1, 0
)

#Screener 2 is passed (1) when items 3 and 5 both equal 1 and items 1, 2, 4 and 6 are all NA
df$screener_2 <- if_else(
  unname(rowSums(!is.na(df[c("screener_2_1", "screener_2_2", "screener_2_4", "screener_2_6")]))) == 0 &
    (!is.na(df$screener_2_3) & df$screener_2_3 == 1) &
    (!is.na(df$screener_2_5) & df$screener_2_5 == 1),
  1, 0
)

#Index = number of screeners passed (0, 1 or 2)
df$attentiveness <- rowSums(dplyr::select(df, screener_1, screener_2))

#Create variables for treatment assignments
df <- df %>% mutate(
  #Allegation arm: the first indicator equal to 1 (Control, then IU, then OR) sets the label;
  #rows matching none of them stay NA
  alleg_treatment = factor(
    case_when(
      Control == 1 ~ "Control",
      IU == 1 ~ "IU",
      OR == 1 ~ "OR"
    ),
    levels = c("Control", "IU", "OR"),
    labels = c("Control", "Info. Uncertain", "Opp. Rally")
  ),
  #Media format: video indicator is checked before text; Text is the first factor level
  media_format = factor(
    case_when(
      video == 1 ~ "Video",
      text == 1 ~ "Text"
    ),
    levels = c("Text", "Video")
  )
)

#Check randomization
table(df$alleg_treatment)
table(df$media_format)
table(df$politician)

#Recode political ideology
#All three recodes are built from the original 7-point party codes; party itself is
#converted to a labelled factor last so the first two still see the numeric codes
df <- df %>% mutate(
  #Three categories with leaners (codes 3 and 5) counted as Independent
  party_3 = factor(
    case_when(
      party %in% c(1, 2) ~ 1,        # "Democrat"
      party %in% c(3, 4, 5) ~ 2,     # "Independent"
      party %in% c(6, 7) ~ 3         # "Republican"
    ),
    levels = c(2, 1, 3),             #Independent as reference category
    labels = c("Independent", "Democrat", "Republican")
  ),
  #Three categories with leaners (codes 3 and 5) counted as partisans
  party_3_lean = factor(
    case_when(
      party %in% c(1, 2, 3) ~ 1,     # "Democrat"
      party %in% 4 ~ 2,              # "Independent"
      party %in% c(5, 6, 7) ~ 3      # "Republican"
    ),
    levels = c(2, 1, 3),             #Independent as reference category
    labels = c("Independent", "Democrat", "Republican")
  ),
  #Full 7-point scale, Independent (code 4) as first level
  party = factor(
    party,
    levels = c(4, 1, 2, 3, 5, 6, 7),
    labels = c("Independent", "Strong Democrat", "Democrat", "Lean Democrat",
               "Lean Republican", "Republican", "Strong Republican")
  )
)

#Create partisanship variables
#side_match is a temporary helper (removed at the end of the mutate):
#  1 = respondent's party_3 matches the politician's side, -1 = opposite side,
#  0 = Independent, NA = party or politician not covered by any rule
df <- df %>% mutate(
  side_match = case_when(
    party_3 == "Independent" ~ 0,
    party_3 == "Democrat" & politician %in% c("Jesse Jackson", "John Murtha") ~ 1,
    party_3 == "Republican" & politician %in% c("Todd Akin", "Tim James") ~ 1,
    party_3 == "Democrat" & politician %in% c("Todd Akin", "Tim James") ~ -1,
    party_3 == "Republican" & politician %in% c("Jesse Jackson", "John Murtha") ~ -1
  ),
  #Strong co-partisanship: 1 for co-partisans, 0 for anti-partisans and Independents
  copartisan = as.numeric(side_match == 1),
  #Moderate partisanship: 1 for Independents, 0 for Democrats and Republicans
  moderate = if_else(party_3 == "Independent", 1, 0),
  #Anti-partisanship: 1 for anti-partisans, 0 for co-partisans and Independents
  antipartisan = as.numeric(side_match == -1),
  #Partisanship: 1 co-partisan, -1 anti-partisan, 0 Independent
  partisan = side_match,
  #Partisanship with leaners: same coding, based on party_3_lean
  partisan_lean = case_when(
    party_3_lean == "Independent" ~ 0,
    party_3_lean == "Democrat" & politician %in% c("Jesse Jackson", "John Murtha") ~ 1,
    party_3_lean == "Republican" & politician %in% c("Todd Akin", "Tim James") ~ 1,
    party_3_lean == "Democrat" & politician %in% c("Todd Akin", "Tim James") ~ -1,
    party_3_lean == "Republican" & politician %in% c("Jesse Jackson", "John Murtha") ~ -1
  ),
  side_match = NULL
)

#Recode gender, race/ethnicity and age
df <- df %>% mutate(
  #Gender: values other than "Male"/"Female" become NA
  gender = factor(gender, levels = c("Male", "Female")),
  #Race/ethnicity: codes 1-5
  race = factor(race, levels = 1:5,
                labels = c("White", "Black", "Hispanic", "Asian", "Other")),
  #Age: convert to numeric, then bin by birth-year cutoffs relative to 2020
  #NOTE(review): the cutoffs use 2020 as the reference year while the data file name is
  #dated 11.28.22 -- confirm this is intended
  age = as.numeric(as.character(age)),
  age2 = case_when(
    age <= 2020 - 1997                      ~ 1,
    age >= 2020 - 1996 & age <= 2020 - 1981 ~ 2,
    age >= 2020 - 1980 & age <= 2020 - 1965 ~ 3,
    age >= 2020 - 1964 & age <= 2020 - 1946 ~ 4,
    age >= 2020 - 1945                      ~ 5
  ),
  #age is overwritten with the labelled bins; age2 keeps the numeric bin code
  age = factor(age2, levels = 1:5,
               labels = c("Gen Z", "Millennials", "Gen X", "Boomers", "Silent"))
)

#Recode education
#Collapse education_2 into four ordered-by-code groups stored as an unordered factor
df <- df %>% mutate(
  education = factor(
    case_when(
      education_2 <= 2 ~ 1,                      #High school graduate or less
      education_2 >= 3 & education_2 <= 4 ~ 2,   #Some college or Associate degree
      education_2 == 5 ~ 3,                      #Bachelor's degree
      education_2 >= 6 ~ 4                       #Graduate degree
    ),
    levels = 1:4,
    labels = c("High school graduate or less", "Some college", "Bachelor's degree", "Graduate degree")
  )
)

#Recode income
#Code 25 is treated as a missing response and replaced with the mean of all other responses.
#na.rm = TRUE is needed here: without it a single NA in income makes the mean NA, and every
#respondent coded 25 would silently become NA instead of being mean-imputed.
df$income <- ifelse(df$income == 25, mean(df$income[df$income != 25], na.rm = TRUE), df$income)
#The imputed mean is generally not a whole number, so the bins are contiguous (no gap
#between 4 and 5 or between 13 and 14); whole-number codes land in the same bins as before.
df <- df %>% mutate(income = case_when(
  income >= 1 & income < 5 ~ 1,            #Under $30k
  income >= 5 & income < 14 ~ 2,           #$30k to under $75k
  income >= 14 ~ 3                         #At least $75k
  ))
df$income <- factor(df$income, levels = c(2,1,3),     #Middle income as reference category
                  labels = c("Middle income","Low income","High income"),
                  ordered = FALSE)

#Recode region
#Codes 1-4 become an unordered factor; any other value becomes NA
df <- df %>% mutate(
  region = factor(region, levels = 1:4,
                  labels = c("Northeast", "Midwest", "South", "West"))
)

#Create variable for whether watched video assigned
#Text respondents are NA. For Video respondents, the click variable belonging to the
#assigned politician decides: equal to 1 -> watched (1), NA -> did not watch (0).
#Any other combination stays NA.
df <- df %>% mutate(watched_video = case_when(
  media_format == "Text" ~ NA_real_,
  #watched
  media_format == "Video" & politician == "Jesse Jackson" & clicked_jesse == 1 ~ 1,
  media_format == "Video" & politician == "John Murtha" & clicked_john == 1 ~ 1,
  media_format == "Video" & politician == "Tim James" & clicked_tim == 1 ~ 1,
  media_format == "Video" & politician == "Todd Akin" & clicked_todd == 1 ~ 1,
  #did not watch
  media_format == "Video" & politician == "Jesse Jackson" & is.na(clicked_jesse) ~ 0,
  media_format == "Video" & politician == "John Murtha" & is.na(clicked_john) ~ 0,
  media_format == "Video" & politician == "Tim James" & is.na(clicked_tim) ~ 0,
  media_format == "Video" & politician == "Todd Akin" & is.na(clicked_todd) ~ 0
))

#How many watched video?
table(df$watched_video)

#Create news media literacy index variable and rename digital literacy
df <- df %>%
  mutate(
    #1 when the answer matches the keyed response, 0 otherwise, NA when unanswered
    literacy1_correct = if_else(literacy1 == 2, 1, 0),
    literacy2_correct = if_else(literacy2 == 4, 1, 0),
    literacy3_correct = if_else(literacy3 == 2, 1, 0),
    #Number of correct answers; NA if any of the three items is NA
    media_literacy = literacy1_correct + literacy2_correct + literacy3_correct
  ) %>%
  rename(digital_literacy = familiar_df)

#Create outcome indices with calculate_mean_effects_index()
#treat is 0 for the Control arm, 1 for any other arm, and NA where alleg_treatment is NA
df$treat <- ifelse(df$alleg_treatment == "Control", 0, 1)

#Belief index (2 items)
mat <- dplyr::select(df, belief_1, belief_2)
df$belief <- calculate_mean_effects_index(Z = df$treat, outcome_mat = mat, reorient = FALSE)

#Support index (4 items)
mat <- dplyr::select(df, support_1, support_2, support_3, support_4)
df$support <- calculate_mean_effects_index(Z = df$treat, outcome_mat = mat, reorient = FALSE)

#Support index without support_4
mat <- dplyr::select(df, support_1, support_2, support_3)
df$support_nodonation <- calculate_mean_effects_index(Z = df$treat, outcome_mat = mat, reorient = FALSE)

#Trust index (2 items)
mat <- dplyr::select(df, trust_1, trust_2)
df$trust <- calculate_mean_effects_index(Z = df$treat, outcome_mat = mat, reorient = FALSE)


#Reselect variables
colnames(df)

#Keep the analysis columns in their final order, then drop rows with no allegation arm
df <- df %>%
  dplyr::select(
    #respondent data
    ResponseId,
    #attentiveness index
    attentiveness,
    #politician name
    politician,
    #media format
    media_format, watched_video,
    #allegations
    alleg_treatment,
    #partisanship indicators
    copartisan, moderate, antipartisan,
    party_3, party_3_lean,
    partisan, partisan_lean,
    #outcomes: indices first, then every column from belief_1 through trust_2
    belief, support, support_nodonation, trust,
    belief_1:trust_2,
    #demographics
    party, gender, race, age, education, income, region,
    #news media and digital literacy
    media_literacy, digital_literacy
  ) %>%
  filter(!is.na(alleg_treatment))

```

##### Export File
```{r export file}
#Write the cleaned data set. The output file is passed positionally because the `path`
#argument name is deprecated in readr >= 1.4.0 (replaced by `file`); the positional form
#works with both older and newer readr.
write_rds(df, "data/df_study4_clean.rds")
```


